The data are accessible from 2017 to the present; we used four entire years of data (2017, 2018, 2019, and 2020) to match the AIS records.
Code
# ---- Load SAR detections ------------------------------------------------
# One-off CSV ingestion, kept for provenance; the combined data frame was
# saved to SAR_fishing.Rdata and is loaded directly below.
#path <- "Data/SAR Vessel detections 2017-2020"
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))
load(here::here("Data", "SAR_fishing.Rdata"))

# ---- Aggregate SAR fishing detections by rounded coordinate -------------
# FIX: filter on fishing_score BEFORE grouping; the result is identical
# (the condition is row-wise) but the filter no longer runs group-by-group.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%  # keep high-confidence fishing vessels only
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# ---- Standardize coordinates and re-aggregate ---------------------------
# standardize_coords() is defined elsewhere in the project; it is assumed to
# snap each lon/lat pair onto the standard grid shared with the AIS data.
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE),
            .groups = "drop")

# ---- Plot ---------------------------------------------------------------
world_map <- map_data("world")

# FIX: map the basemap coordinates to the x/y aesthetics explicitly;
# `aes(long = long, lat = lat, ...)` named two non-existent aesthetics
# that geom_map() silently ignored.
ggplot() +
  geom_map(
    data = world_map, map = world_map,
    aes(x = long, y = lat, map_id = region),
    color = "black", fill = "lightgray", size = 0.1
  ) +
  geom_tile(
    data = SAR_data_std,
    aes(x = lon_std, y = lat_std, fill = total_presence_score)
  ) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
Compare AIS and SAR detection locations
Identifying grid cells with only AIS detections, only SAR detections, or both data types.
Code
# Merge the datasetscombined_data <-full_join( AIS_data_std, SAR_data_std,by =c("lon_std", "lat_std"))# Categorize each cellcombined_data <- combined_data %>%mutate(category =case_when( total_fishing_hours >0& total_presence_score >0~"Both AIS and SAR", total_fishing_hours >0& (is.na(total_presence_score) | total_presence_score ==0) ~"Only AIS", (is.na(total_fishing_hours) | total_fishing_hours ==0) & total_presence_score >0~"Only SAR",TRUE~"No fishing detected" ))# Create the world mapworld_map <-map_data("world")# Create the plotworld_plot <-ggplot() +geom_map(data = world_map, map = world_map,aes(long = long, lat = lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = combined_data, aes(x = lon_std, y = lat_std, fill = category)) +scale_fill_manual(values =c("Both AIS and SAR"="purple", "Only AIS"="blue", "Only SAR"="red", "No fishing detected"="white"),name ="Fishing Data Source",guide =guide_legend(title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Detection",subtitle ="Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )# Display the plotprint(world_plot)
Code
# Get summary statisticssummary_stats <- combined_data %>%count(category) %>%mutate(percentage = n /sum(n) *100) %>%rename(`Number of cells`= n) %>%mutate(percentage =round(percentage, 2))# Create and display the tablekable(summary_stats, format ="html", col.names =c("Category", "Number of cells", "Percentage (%)"),caption ="Summary Statistics of Data Categories") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(2, width ="150px") %>%column_spec(3, width ="150px")
Summary Statistics of Data Categories
Category
Number of cells
Percentage (%)
Both AIS and SAR
163095
9.12
Only AIS
1566190
87.60
Only SAR
58668
3.28
Random forest model
Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution, using a random forest model trained on >160,000 global associations between SAR vessel detections and AIS fishing hours, together with geographical coordinates, distance to ports, distance to shore, and bathymetry.
Code
# Load the pre-adjusted 0.1-degree environmental rasters.
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack order determines column order in the data frame below:
# shore, ports, bathymetry.
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# One row per grid cell, keeping the x/y cell-centre coordinates.
raster_df <- as.data.frame(raster_stack, xy = TRUE)
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")
raster_df <- na.omit(raster_df)

# data.table for a fast keyed join against the AIS/SAR grid.
setDT(raster_df)
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# NOTE(review): the file name below contains a capital letter "O"
# ("O1deg"), not a zero, while the loaded object is combined_data_01deg —
# confirm the file on disk really is spelled this way.
load(here::here("data", "combined_data_O1deg.Rdata"))
setDT(combined_data_01deg)

# Inner join: nomatch = 0 drops AIS/SAR cells with no raster coverage.
combined_data_with_rasters <- raster_df[combined_data_01deg,
                                        on = .(lon_std, lat_std),
                                        nomatch = 0]
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Training data: cells observed by both AIS and SAR, complete predictors only.
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()
#save(training_data, file = here::here("training_data.Rdata"))

# Random-forest fit (run once; the fitted model is loaded below).
#set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = training_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})
#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
#save(rf_model_env, file = "rf_model_01deg_with_rasters.Rdata")
load(here::here("rf_model_01deg_with_rasters.Rdata"))
Comparison of transformations in models
Code
# Prepare the data
load(here::here("training_data.Rdata"))

# ---- One-off model training, kept for provenance ------------------------
#training_data_log <- training_data %>%
#  mutate(
#    log_total_presence_score = log10(total_presence_score + 1),
#    log_total_fishing_hours = log10(total_fishing_hours + 1)
#  )
#run_model <- function(formula, data) {
#  randomForest(formula, data = data, ntree = 500, importance = TRUE)
#}
#num_cores <- detectCores() - 1  # use all but one core
#cl <- makeCluster(num_cores)
#clusterExport(cl, c("training_data_log", "run_model"))
#clusterEvalQ(cl, library(randomForest))
#models <- list(
#  no_transform = as.formula(total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#  original = as.formula(total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy),
#  log = as.formula(log_total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy)
#)
#results <- parLapply(cl, models, function(formula) run_model(formula, training_data_log))
#stopCluster(cl)
#rf_model_no_transform <- results[[1]]
#rf_model_original <- results[[2]]
#rf_model_log <- results[[3]]
#saveRDS(rf_model_no_transform, "rf_model_no_transform.rds")
#saveRDS(rf_model_original, "rf_model_original.rds")
#saveRDS(rf_model_log, "rf_model_log.rds")
#rf_model_fishing_log <- randomForest(
#  log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#  data = training_data_log, ntree = 500, importance = TRUE
#)
#saveRDS(rf_model_fishing_log, "rf_model_fishing_log.rds")

# ---- Load the saved models ----------------------------------------------
rf_model_no_transform <- readRDS(here::here("rf_model_no_transform.rds"))
rf_model_original <- readRDS(here::here("rf_model_original.rds"))
rf_model_log <- readRDS(here::here("rf_model_log.rds"))
rf_model_fishing_log <- readRDS(here::here("rf_model_fishing_log.rds"))

# Evaluate a fitted randomForest regression model against `data`.
# log_target = TRUE means the model predicts log10(hours + 1); predictions
# are back-transformed before the error metrics are computed.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # NOTE(review): MAPE is undefined where actual == 0; such cells produce
  # Inf and inflate the mean (na.rm does not drop Inf) — TODO confirm the
  # validation set contains no zero-hour cells.
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # FIX: randomForest objects have no $forest$independent.variable.names
  # component (that field belongs to ranger), so the old code counted
  # p = 0 predictors and the adjusted R-squared collapsed to the plain
  # R-squared. Count predictors from the importance matrix (one row each).
  n <- length(actual)
  p <- nrow(importance(model))
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for random forest)
  feature_importance <- importance(model)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# ---- Build the validation set: cells seen by both AIS and SAR -----------
validation_data <- combined_data_with_rasters %>%
  mutate(data_category = case_when(
    has_AIS & has_SAR ~ "Both AIS and SAR",
    has_AIS & !has_SAR ~ "Only AIS",
    !has_AIS & has_SAR ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

# ---- Evaluate all four models -------------------------------------------
results_no_transform <- evaluate_model(rf_model_no_transform, validation_data)

# Models trained on the log presence score need that column added.
validation_data_logpres <- validation_data %>%
  mutate(log_total_presence_score = log10(total_presence_score + 1))
results_original <- evaluate_model(rf_model_original, validation_data_logpres)
results_fishing_log <- evaluate_model(rf_model_fishing_log, validation_data, log_target = TRUE)
results_log <- evaluate_model(rf_model_log, validation_data_logpres, log_target = TRUE)

# ---- Comparison table ---------------------------------------------------
# Helper avoids repeating the same eight extractions for every model.
metric_names <- c("Mean Absolute Error", "Root Mean Squared Error",
                  "Mean Absolute Percentage Error", "Median Absolute Error",
                  "R-Squared", "Adjusted R-Squared",
                  "Mean of Residuals", "Standard Deviation of Residuals")
extract_metrics <- function(res) unlist(res[metric_names], use.names = FALSE)

results_df <- data.frame(
  Metric = metric_names,
  No_Transform = extract_metrics(results_no_transform),
  Fishing_Log = extract_metrics(results_fishing_log),
  Presence_Log = extract_metrics(results_original),
  Both_Log = extract_metrics(results_log)
)

# Render the comparison table.
kable(results_df, format = "html", digits = 3,
      col.names = c("Metric", "No Transform", "Fishing Hours Log",
                    "Presence Score Log", "Both Log"),
      caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Models" = 4)) %>%
  column_spec(1, bold = TRUE)
Model Performance Comparison
Models
Metric
No Transform
Fishing Hours Log
Presence Score Log
Both Log
Mean Absolute Error
174.631
206.665
175.244
206.571
Root Mean Squared Error
809.108
1268.665
816.945
1271.105
Mean Absolute Percentage Error
1244.644
69.713
1245.517
69.704
Median Absolute Error
22.474
10.185
22.599
10.206
R-Squared
0.812
0.824
0.808
0.824
Adjusted R-Squared
0.812
0.824
0.808
0.824
Mean of Residuals
-4.300
169.775
-4.313
169.508
Standard Deviation of Residuals
809.099
1257.258
816.936
1259.756
Interpretation of model comparison metrics
Based on the provided performance metrics, I would choose the Fishing Hours Log-Transformed Model. Here’s the reasoning:
R-Squared and Adjusted R-Squared: The Fishing Hours Log model has the joint-highest R-squared (0.824, tied with the Both Log model) and Adjusted R-squared values, indicating it explains the most variance in the data.
Mean Absolute Percentage Error (MAPE): This model has a significantly lower MAPE (69.71%) compared to the No Transform and Presence Log models (both over 1200%). This suggests much better relative accuracy. Median Absolute Error: It has the lowest median absolute error (10.19), which indicates good performance on typical cases.
Root Mean Squared Error (RMSE): While higher than the No Transform model, the difference isn’t as dramatic as the improvement in MAPE.
Mean Absolute Error (MAE): Although higher than No Transform and Presence Log models, this should be considered in context with other metrics.
The Both Log model performs very similarly to the Fishing Hours Log model, but the latter edges it out slightly in most metrics.
The No Transform and Presence Log models, despite having lower MAE and RMSE, have extremely high MAPE values, suggesting they might be making large relative errors, especially on smaller values.
The logarithmic transformation of fishing hours seems to have addressed some issues with the data distribution, leading to more balanced performance across different scales of the target variable.
In conclusion, the Fishing Hours Log-Transformed Model appears to offer the best overall performance, particularly in terms of explained variance and relative error metrics. However, the choice might depend on the specific requirements of your application, such as whether absolute or relative errors are more important in your context.
Selected Model performance
Code
# Evaluate a fitted randomForest regression model against `data`.
# log_target = TRUE means the model predicts log10(hours + 1); predictions
# are back-transformed before the error metrics are computed.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # NOTE(review): MAPE is undefined where actual == 0; such cells produce
  # Inf and inflate the mean (na.rm does not drop Inf).
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared.
  # FIX: randomForest objects have no $forest$independent.variable.names
  # component (that field belongs to ranger), so the old code counted
  # p = 0 predictors and the adjusted R-squared collapsed to the plain
  # R-squared. Count predictors from the importance matrix (one row each).
  n <- length(actual)
  p <- nrow(importance(model))
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for random forest)
  feature_importance <- importance(model)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# Validation set: cells observed by both AIS and SAR.
validation_data <- combined_data_with_rasters %>%
  mutate(data_category = case_when(
    has_AIS & has_SAR ~ "Both AIS and SAR",
    has_AIS & !has_SAR ~ "Only AIS",
    !has_AIS & has_SAR ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

# Evaluate the selected model.
results_rf_env <- evaluate_model(rf_model_env, validation_data)

# Scalar metrics only; feature importance is tabulated separately below.
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error",
             "Mean Absolute Percentage Error", "Median Absolute Error",
             "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_env$`Mean Absolute Error`,
                  results_rf_env$`Root Mean Squared Error`,
                  results_rf_env$`Mean Absolute Percentage Error`,
                  results_rf_env$`Median Absolute Error`,
                  results_rf_env$`R-Squared`,
                  results_rf_env$`Adjusted R-Squared`,
                  results_rf_env$`Mean of Residuals`,
                  results_rf_env$`Standard Deviation of Residuals`), 2)
)

# Create and display the table
kable(results_table, format = "html", digits = 4,
      caption = "Model Evaluation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics
Metric
Value
Mean Absolute Error
174.91
Root Mean Squared Error
810.11
Mean Absolute Percentage Error
1250.65
Median Absolute Error
22.69
R-Squared
0.81
Adjusted R-Squared
0.81
Mean of Residuals
-3.60
Standard Deviation of Residuals
810.11
Code
# For feature importance, create a separate tablefeature_importance <-as.data.frame(results_rf_env$`Feature Importance`)feature_importance$Feature <-rownames(feature_importance)feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]colnames(feature_importance) <-c("Feature", "%IncMSE", "IncNodePurity")# Sort the feature importance table by %IncMSE in descending orderfeature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]kable(feature_importance, format ="html", digits =4, col.names =c("Feature", "%IncMSE", "IncNodePurity"),caption ="Feature Importance") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(1, bold =TRUE) %>%column_spec(2:3, width ="150px")
Feature Importance
Feature
%IncMSE
IncNodePurity
total_presence_score
total_presence_score
127.1595
737080845352
lon_std
lon_std
80.5654
557956385485
dist_ports
dist_ports
55.9307
312138319465
bathy
bathy
48.4341
238192589722
lat_std
lat_std
48.0964
540028288659
dist_shore
dist_shore
46.3306
213555877780
Maps of predictions
Code
# ---- Predict fishing hours for every grid cell --------------------------
# Predictor columns must match those used to train rf_model_env.
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy)
predictions <- predict(rf_model_env, newdata = prediction_data)

# Keep observed AIS hours where available; fill SAR-only cells with the
# model prediction matched by cell coordinates; everything else gets 0.
combined_data_01deg <- combined_data_01deg %>%
  mutate(predicted_fishing_hours = case_when(
    has_AIS ~ total_fishing_hours,
    has_SAR ~ predictions[match(paste(lon_std, lat_std),
                                paste(prediction_data$lon_std, prediction_data$lat_std))],
    TRUE ~ 0
  ))

# Create the world map
world_map <- map_data("world")

# SAR-only cells for mapping.
# FIX: renamed from `map_data` — that name shadowed ggplot2::map_data(),
# which is called as a function above and in later chunks.
sar_only_map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)
#predicted_SAR_only_1RF = sar_only_map_data
#save(predicted_SAR_only_1RF, file = "predicted_SAR_only_1RF.Rdata")

# Build a regional map of predicted fishing hours.
# `data` must contain a `predicted_fishing_hours` column plus the columns
# named by `lon_col` / `lat_col`; `lon_range` / `lat_range` crop the view.
create_region_map <- function(data, world_map, lon_col, lat_col,
                              lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data,
              aes(x = .data[[lon_col]], y = .data[[lat_col]],
                  fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)",
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                             title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title, subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(
  sar_only_map_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Predicted Fishing Hours in Areas with Only SAR Detections",
  "0.1 degree resolution"
)
# Caribbean map
caribbean_map <- create_region_map(
  sar_only_map_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.1 degree resolution"
)
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(
  sar_only_map_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.1 degree resolution"
)
# Asia map
asia_map <- create_region_map(
  sar_only_map_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.1 degree resolution"
)

# Print the global map (regional maps are printed in the next chunks).
print(predicted_SAR_only_plot)
Code
print(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Code
#Map of both original and predicted AIS fishing hours # Visualize the resultspredicted_plot <-ggplot() +geom_map(data = world_map, map = world_map,aes(long, lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = combined_data_01deg, aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Hours (0.1 degree resolution)",subtitle ="Based on AIS data and Random Forest predictions from SAR data",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )print(predicted_plot)
Code
#Aggregate data to 0.5 degree resolution # First, round the coordinates to the nearest 0.5 degreecombined_data_05deg <- combined_data_01deg %>%mutate(lon_05deg =round(lon_std *2) /2,lat_05deg =round(lat_std *2) /2 )# Aggregate the dataaggregated_data <- combined_data_05deg %>%group_by(lon_05deg, lat_05deg) %>%summarise(predicted_fishing_hours =sum(predicted_fishing_hours, na.rm =TRUE),.groups ='drop' )#save(aggregated_data, file=here::here("Predicted_Fishing_Hours_05Deg.Rdata"))# Create the mappredicted_plot_05deg <-ggplot() +geom_map(data = world_map, map = world_map,aes(long, lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = aggregated_data, aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Hours (0.5 degree resolution)",subtitle ="Based on AIS data and Random Forest predictions from SAR data",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )# Print the mapprint(predicted_plot_05deg)
Code
# Caribbean mapcaribbean_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-100, -50), c(0, 40), "Fishing Hours in the Caribbean", "0.5 degree resolution")# Northwestern Indian Ocean to Western European waters mapindian_european_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(-20, 80), c(0, 70), "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution")# Asia mapasia_map <-create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg", c(60, 180), c(-20, 60), "Fishing Hours in Asia", "0.5 degree resolution")# Print the mapsprint(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Test for spatial auto-correlation
Code
## ------------ Create a non-random test set ------------------------------
# Cells with both AIS and SAR data and complete predictor rows.
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()

# 70/30 train/test split.
set.seed(123)  # for reproducibility
train_indices <- sample(seq_len(nrow(data)), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model (run once; the fitted model is loaded below).
#set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = train_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})
#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
#rf_model_env_train = rf_model_env
#save(rf_model_env_train, file = "rf_model_01deg_with_rasters_train.Rdata")
load(here::here("rf_model_01deg_with_rasters_train.Rdata"))

# Test-set residuals.
test_data$predictions <- predict(rf_model_env_train, test_data)
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Spatial weights from the k nearest neighbours of each test cell.
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)
k <- 8  # number of neighbours; adjust as needed
nb <- knn2nb(knearneigh(st_coordinates(test_data_sf), k = k))
lw <- nb2listw(nb, style = "W")

# Moran's I on the test-set residuals.
moran_test <- moran.test(test_data_sf$residuals, lw)

# FIX: the expectation and variance are tiny (~1e-5); rounding them to two
# decimals rendered them as 0 in the output table. Format them in
# scientific notation, consistent with the spatial-CV Moran table below.
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance",
                "Standard deviate", "P-value"),
  Value = c(
    round(moran_test$estimate[1], 2),
    format(moran_test$estimate[2], scientific = TRUE, digits = 4),
    format(moran_test$estimate[3], scientific = TRUE, digits = 4),
    round(moran_test$statistic, 2),
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
           format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)
# FIX: drop the row names inherited from the named estimate vector so the
# first kable column is not duplicated in the rendered table.
rownames(moran_results) <- NULL

# Create and display the table
kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
Moran I Test under randomisation
Statistic
Value
Moran I statistic
Moran I statistic
0.25
Expectation
Expectation
0
Variance
Variance
0
Moran I statistic standard deviate
Standard deviate
115.35
P-value
< 2.2e-16
Spatial cross-validation
Code
load(here::here("training_data.Rdata"))# Assuming your training_data has lon_std (longitude) and lat_std (latitude) columnscoords <-SpatialPoints(training_data[, c("lon_std", "lat_std")])# Combine your training data with spatial coordinatesspatial_data <-SpatialPointsDataFrame(coords, data = training_data)#Testing to split the coordinates with k-means clustering# Ensure spatial_data is an sf objectif (!inherits(spatial_data, "sf")) { spatial_data_sf <-st_as_sf(spatial_data)} else { spatial_data_sf <- spatial_data}# Extract coordinatescoords <-st_coordinates(spatial_data_sf)# Perform k-means clusteringset.seed(123) # for reproducibilitykmeans_result <-kmeans(coords, centers =6)# Add cluster assignments to the sf objectspatial_data_sf$block <-as.factor(kmeans_result$cluster)# Create the plotggplot() +geom_sf(data = spatial_data_sf, aes(color = block), size =1) +scale_color_viridis_d(name ="Block") +theme_minimal() +labs(title ="Spatial Blocks for 6-Fold Cross-Validation",subtitle ="Points colored by block assignment") +theme(legend.position ="bottom")
Code
# Spatial 6-fold cross-validation with Moran's I computed per fold.
# The expensive steps (per-fold model fits, neighbour lists, Moran's I)
# were run once and saved; only the saved output is loaded and tabulated.

# ---- One-off computation, kept for provenance ---------------------------
#cl <- makeCluster(detectCores() - 1)
#registerDoParallel(cl)
#results <- foreach(i = 1:6, .packages = c("randomForest", "dplyr"), .export = "spatial_data_sf") %dopar% {
#  test_indices <- spatial_data_sf$block == i
#  train_data <- spatial_data_sf[!test_indices, ]
#  test_data <- spatial_data_sf[test_indices, ]
#  rf_model <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = train_data,
#    ntree = 500,
#    importance = TRUE
#  )
#  model_filename <- paste0("rf_model_fold_", i, ".rds")
#  saveRDS(rf_model, file = model_filename)
#  predictions <- predict(rf_model, test_data)
#  list(
#    results = data.frame(observed = test_data$total_fishing_hours, predicted = predictions, fold = i),
#    model_filename = model_filename
#  )
#}
#stopCluster(cl)
#results_df <- do.call(rbind, lapply(results, function(x) x$results))
#model_filenames <- sapply(results, function(x) x$model_filename)
#save(results_df, file = "spatial_cv_results.RData")
#save(model_filenames, file = "model_filenames.RData")
#load(here::here("spatial_cv_results.RData"))
#load(here::here("model_filenames.RData"))
#spatial_data <- spatial_data[order(row.names(spatial_data)), ]
#results_df <- results_df[order(row.names(results_df)), ]
#all_coordinates <- coordinates(spatial_data)
#k <- 30  # number of nearest neighbours, adjust as needed
#nb_all <- knn2nb(knearneigh(all_coordinates, k = k))
#lw_all <- nb2listw(nb_all, style = "W", zero.policy = TRUE)
#calculate_morans_i <- function(residuals, indices, lw_all) {
#  subset_vector <- rep(FALSE, length(lw_all$neighbours))
#  subset_vector[indices] <- TRUE
#  lw_subset <- subset.listw(lw_all, subset_vector, zero.policy = TRUE)
#  moran.test(residuals, lw_subset, zero.policy = TRUE)
#}
#morans_i_results <- list()
#for (i in 1:6) {
#  fold_indices <- which(results_df$fold == i)
#  fold_residuals <- results_df$observed[fold_indices] - results_df$predicted[fold_indices]
#  morans_i_results[[i]] <- calculate_morans_i(fold_residuals, fold_indices, lw_all)
#  print(paste("Completed fold", i))
#}
#save(morans_i_results, file = "morans_i_results.Rdata")

load(here::here("morans_i_results.Rdata"))

# ---- Tabulate the per-fold Moran's I results ----------------------------
morans_table <- data.frame(
  Fold = 1:6,
  Moran_I = sapply(morans_i_results, function(x) x$estimate[1]),
  Expectation = sapply(morans_i_results, function(x) x$estimate[2]),
  Variance = sapply(morans_i_results, function(x) x$estimate[3]),
  Std_Deviate = sapply(morans_i_results, function(x) x$statistic),
  P_Value = sapply(morans_i_results, function(x) x$p.value)
)

# Scientific notation for the tiny expectation/variance values; clamp the
# p-value display at machine precision.
morans_table_formatted <- morans_table %>%
  mutate(
    Moran_I = round(Moran_I, 4),
    Expectation = format(Expectation, scientific = TRUE, digits = 4),
    Variance = format(Variance, scientific = TRUE, digits = 4),
    Std_Deviate = round(Std_Deviate, 2),
    P_Value = ifelse(P_Value < 2.2e-16, "< 2.2e-16",
                     format(P_Value, scientific = TRUE, digits = 4))
  )

# Render the table.
kable(morans_table_formatted,
      col.names = c("Fold", "Moran's I", "Expectation", "Variance",
                    "Std. Deviate", "p-value"),
      caption = "Moran's I Results for Each Fold in Spatial Cross-Validation",
      align = c('c', 'r', 'r', 'r', 'r', 'r'),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "150px")
Moran's I Results for Each Fold in Spatial Cross-Validation
Fold
Moran's I
Expectation
Variance
Std. Deviate
p-value
1
0.4423
-7.330e-05
4.410e-06
210.64
< 2.2e-16
2
0.4504
-1.275e-05
8.039e-07
502.39
< 2.2e-16
3
0.3825
-7.548e-05
4.594e-06
178.51
< 2.2e-16
4
0.6267
-3.090e-05
1.907e-06
453.87
< 2.2e-16
5
0.3767
-9.321e-05
5.730e-06
157.42
< 2.2e-16
6
0.4118
-6.942e-05
4.234e-06
200.15
< 2.2e-16
Spatially blocked predictions
Code
# ---- Predictions from the six spatially blocked models ----------------------
# Reflowed from a rendered document in which line breaks were lost.

# Load necessary libraries
library(raster)
library(ggplot2)
library(viridis)
library(dplyr)
library(data.table)

# Load the model filenames
# load(here::here("model_filenames.RData"))

# Function to make predictions using all 6 models: predict with each fold's
# saved forest, then average across folds
# make_predictions <- function(data) {
#   predictions <- matrix(0, nrow = nrow(data), ncol = 6)
#   for (i in 1:6) {
#     model <- readRDS(here::here(model_filenames[i]))
#     predictions[, i] <- predict(model, data)
#   }
#   # Take the average prediction across all models
#   rowMeans(predictions)
# }

# Prepare the prediction data (same as before)
# prediction_data <- combined_data_with_rasters %>%
#   dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions using the spatially blocked models
# new_predictions <- make_predictions(prediction_data)
# save(new_predictions, file = here::here("new_predictions.Rdata"))
load(here::here("new_predictions.Rdata"))

# Add predictions to the original dataset: observed AIS hours where AIS
# exists, ensemble predictions where only SAR exists, zero otherwise.
# NOTE(review): rows are matched by paste()-ing coordinates; this assumes
# lon_std/lat_std are identically rounded in both tables -- confirm both come
# from the same standardisation step.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ new_predictions[match(
        paste(lon_std, lat_std),
        paste(prediction_data$lon_std, prediction_data$lat_std)
      )],
      TRUE ~ 0
    )
  )

# Now you can use the same mapping code as before, but it will use the new
# predictions. Map of predicted fishing hours only (cells with SAR but no AIS)
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)
# predicted_SAR_only_6RF = map_data
# save(predicted_SAR_only_6RF, file = "predicted_SAR_only_6RF.Rdata")

# Use your create_region_map function to create the maps
# Global map
predicted_SAR_only_plot <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Predicted Fishing Hours in Areas with Only SAR Detections",
  "0.1 degree resolution (Spatially Blocked Model)"
)
# Caribbean map
caribbean_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.1 degree resolution (Spatially Blocked Model)"
)
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.1 degree resolution (Spatially Blocked Model)"
)
# Asia map
asia_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.1 degree resolution (Spatially Blocked Model)"
)

# Print the maps
print(predicted_SAR_only_plot)
Code
print(caribbean_map)
Code
print(indian_european_map)
Code
print(asia_map)
Code
# Create the global map of both original and predicted fishing hourspredicted_plot <-ggplot() +geom_map(data = world_map, map = world_map,aes(long, lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = combined_data_01deg, aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Hours (0.1 degree resolution)",subtitle ="Based on AIS data and Spatially Blocked Random Forest predictions from SAR data",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )print(predicted_plot)
Code
# Aggregate data to 0.5 degree resolutioncombined_data_05deg <- combined_data_01deg %>%mutate(lon_05deg =round(lon_std *2) /2,lat_05deg =round(lat_std *2) /2 ) %>%group_by(lon_05deg, lat_05deg) %>%summarize(predicted_fishing_hours =sum(predicted_fishing_hours, na.rm =TRUE) ) %>%ungroup()# Global map (0.5 degree)global_map_05deg <-create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", c(-180, 180), c(-90, 90), "Global Fishing Hours", "0.5 degree resolution (Observed and Predicted)")# Caribbean map (0.5 degree)caribbean_map_05deg <-create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", c(-100, -50), c(0, 40), "Fishing Hours in the Caribbean", "0.5 degree resolution (Observed and Predicted)")# Northwestern Indian Ocean to Western European waters map (0.5 degree)indian_european_map_05deg <-create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", c(-20, 80), c(0, 70), "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic", "0.5 degree resolution (Observed and Predicted)")# Asia map (0.5 degree)asia_map_05deg <-create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg", c(60, 180), c(-20, 60), "Fishing Hours in Asia", "0.5 degree resolution (Observed and Predicted)")# Print the mapsprint(global_map_05deg)
Code
print(caribbean_map_05deg)
Code
print(indian_european_map_05deg)
Code
print(asia_map_05deg)
Differences among predictions
Code
# Load the dataload("predicted_SAR_only_6RF.Rdata")load("predicted_SAR_only_1RF.Rdata")# Prepare the data (assuming you've already loaded and arranged it)combined_data <-bind_rows( predicted_SAR_only_6RF %>% dplyr::select(predicted_fishing_hours) %>%mutate(method ="6RF"), predicted_SAR_only_1RF %>% dplyr::select(predicted_fishing_hours) %>%mutate(method ="1RF"))# 1. Log-transformed density plotlog_density_plot <-ggplot(combined_data, aes(x = predicted_fishing_hours +1, fill = method)) +geom_density(alpha =0.5) +scale_x_log10(labels = scales::comma) +scale_fill_manual(values =c("6RF"="blue", "1RF"="red")) +labs(title ="Distribution of Predicted Fishing Hours (Log Scale)",x ="Predicted Fishing Hours (Log Scale)",y ="Density",fill ="Method") +theme_minimal() +theme(legend.position ="bottom")print(log_density_plot)
---
title: "Global_fishing_watch_workflow_envpreds"
author: "Théophile L. Mouton"
date: "`r Sys.Date()`"
format:
  html:
    toc: true
    toc-location: right
    css: custom.css
    output-file: "Global_Fishing_Watch_workflow_envpreds.html"
    self-contained: true
    code-fold: true
    code-tools: true
editor: visual
execute:
  warning: false
params:
  printlabels: TRUE
---

# Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch

## Open R libraries

```{r}
# Load required libraries
library(tidyverse)
library(tidyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(maps)
library(raster)
library(sf)
library(viridis)
library(scales)
library(dplyr)
library(randomForest)
library(caret)
library(pdp)
library(knitr)
library(kableExtra)
library(future)
library(spdep)
library(ncf)
library(blockCV)
library(parallel)
library(doParallel)
library(sp)
```

## AIS data

Data from Kroodsma et al. (2018) Science, accessible at: [Global Fishing Watch Data Download Portal](https://globalfishingwatch.org/data-download/datasets/public-fishing-effort).

The data is accessible up to the end of the year 2020; we used four entire years of data (2017, 2018, 2019, 2020) to match SAR data records.

```{r}
# Set the path to the 2017-2020 folder
#path <- "Data/AIS Fishing Effort 2017-2020"
# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)
# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
#  map_df(~read_csv(.))
load(here::here("Data", "AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

# Function to standardize coordinates to 0.1 degree resolution
# (floor to the cell's lower-left corner)
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}

# Standardize and aggregate AIS data
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(coords = map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE),
            .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std,
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
```

## Sentinel-1 SAR data

Data from Paolo et al.
2024 Nature, accessible at: [Global Fishing Watch SAR Vessel Detections](https://globalfishingwatch.org/data-download/datasets/public-sar-vessel-detections:v20231026)The data is accessible from 2017 to today, we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.```{r}# Set the path to the 2016 folder#path <- "Data/SAR Vessel detections 2017-2020"# List all CSV files in the folder#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)# Read all CSV files and combine them into a single data frame#SAR_fishing <- SAR_csv_files %>%# map_df(~read_csv(.))load(here::here("Data","SAR_fishing.Rdata"))# Aggregate fishing hours by latitude and longitudeaggregated_SAR_fishing <- SAR_fishing %>%mutate(lat_rounded =round(lat, digits =2),lon_rounded =round(lon, digits =2) ) %>%group_by(lat_rounded, lon_rounded) %>%filter(fishing_score >=0.9) %>%summarise(total_presence_score =sum(presence_score, na.rm =TRUE),avg_fishing_score =mean(fishing_score, na.rm =TRUE),count =n() ) %>%mutate(total_presence_score =round(total_presence_score, digits =0)) %>%ungroup()# Standardize and aggregate SAR dataSAR_data_std <- aggregated_SAR_fishing %>%mutate(coords =map2(lon_rounded, lat_rounded, standardize_coords)) %>%mutate(lon_std =map_dbl(coords, ~ .x$lon_std),lat_std =map_dbl(coords, ~ .x$lat_std) ) %>%group_by(lon_std, lat_std) %>%summarise(total_presence_score =sum(total_presence_score, na.rm =TRUE), .groups ="drop")# Create the world mapworld_map <-map_data("world")# Create the plotggplot() +geom_map(data = world_map, map = world_map,aes(long = long, lat = lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = SAR_data_std, aes(x = lon_std, y = lat_std, fill = total_presence_score)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="Fishing vessel detections (2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight 
=0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title =NULL,x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )```## Compare AIS, and SAR detection locationsIdentifying grid cells with only AIS, only SAR detections or both data types.```{r}# Merge the datasetscombined_data <-full_join( AIS_data_std, SAR_data_std,by =c("lon_std", "lat_std"))# Categorize each cellcombined_data <- combined_data %>%mutate(category =case_when( total_fishing_hours >0& total_presence_score >0~"Both AIS and SAR", total_fishing_hours >0& (is.na(total_presence_score) | total_presence_score ==0) ~"Only AIS", (is.na(total_fishing_hours) | total_fishing_hours ==0) & total_presence_score >0~"Only SAR",TRUE~"No fishing detected" ))# Create the world mapworld_map <-map_data("world")# Create the plotworld_plot <-ggplot() +geom_map(data = world_map, map = world_map,aes(long = long, lat = lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = combined_data, aes(x = lon_std, y = lat_std, fill = category)) +scale_fill_manual(values =c("Both AIS and SAR"="purple", "Only AIS"="blue", "Only SAR"="red", "No fishing detected"="white"),name ="Fishing Data Source",guide =guide_legend(title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title ="Global Fishing Detection",subtitle ="Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )# Display the plotprint(world_plot)# Get summary statisticssummary_stats <- combined_data %>%count(category) %>%mutate(percentage = n 
/sum(n) *100) %>%rename(`Number of cells`= n) %>%mutate(percentage =round(percentage, 2))# Create and display the tablekable(summary_stats, format ="html", col.names =c("Category", "Number of cells", "Percentage (%)"),caption ="Summary Statistics of Data Categories") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(2, width ="150px") %>%column_spec(3, width ="150px")```## Random forest modelPredicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of \>160,000 associations between SAR vessel detections and AIS fishing hours globally, geographical coordinates, distance to ports, distance to shore and bathymetry.```{r}# Open the saved adjusted rastersPorts_adjusted <-raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))Shore_adjusted <-raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))Bathy_adjusted <-raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))# Stack the resampled rastersraster_stack <-stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)# Convert the stack to a dataframeraster_df <-as.data.frame(raster_stack, xy =TRUE)# Rename the columnsnames(raster_df) <-c("x", "y", "dist_shore", "dist_ports", "bathy")# Remove NA values if desiredraster_df <-na.omit(raster_df)# Convert to data.table for efficiencysetDT(raster_df)# Round x and y to 1 decimal place for consistencyraster_df[, `:=`(lon_std =round(x, digits =1),lat_std =round(y, digits =1))]# Now proceed with the join and model trainingload(here::here("data","combined_data_O1deg.Rdata"))setDT(combined_data_01deg)# Perform the join using data.tablecombined_data_with_rasters <- raster_df[combined_data_01deg, on = .(lon_std, lat_std), nomatch =0]# Convert back to dataframe if neededcombined_data_with_rasters 
<-as.data.frame(combined_data_with_rasters)# Prepare the training datatraining_data <- combined_data_with_rasters %>%filter(has_AIS & has_SAR) %>% dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%na.omit()#save(training_data, file=here::here("training_data.Rdata"))# Train the random forest model with timing#set.seed(123) # for reproducibility#rf_timing <- system.time({# rf_model_env <- randomForest(# total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,# data = training_data,# ntree = 500,# importance = TRUE# )#})#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")# Save the new model#save(rf_model_env, file = "rf_model_01deg_with_rasters.Rdata")load(here::here("rf_model_01deg_with_rasters.Rdata"))```### Comparison of transformations in models ```{r}# Prepare the dataload(here::here("training_data.Rdata"))#training_data_log <- training_data %>%# mutate(# log_total_presence_score = log10(total_presence_score + 1),# log_total_fishing_hours = log10(total_fishing_hours + 1)# )# Function to run a single model#run_model <- function(formula, data) {# randomForest(# formula,# data = data,# ntree = 500,# importance = TRUE# )#}# Set up parallel processing#num_cores <- detectCores() - 1 # Use all but one core#cl <- makeCluster(num_cores)# Export necessary objects to the cluster#clusterExport(cl, c("training_data_log", "run_model"))# Load required packages on each cluster#clusterEvalQ(cl, library(randomForest))# Define the models#models <- list(# no_transform = as.formula(total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy),# original = as.formula(total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy),# log = as.formula(log_total_fishing_hours ~ log_total_presence_score + lon_std + lat_std + dist_shore + #dist_ports + bathy)#)# Run models in parallel#results 
<- parLapply(cl, models, function(formula) run_model(formula, training_data_log))# Stop the cluster#stopCluster(cl)# Save the models#rf_model_no_transform <- results[[1]]#rf_model_original <- results[[2]]#rf_model_log <- results[[3]]# Save models to files#saveRDS(rf_model_no_transform, "rf_model_no_transform.rds")#saveRDS(rf_model_original, "rf_model_original.rds")#saveRDS(rf_model_log, "rf_model_log.rds")# Add the new model with only total_fishing_hours log-transformed#rf_model_fishing_log <- randomForest(# log_total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,# data = training_data_log,# ntree = 500,# importance = TRUE#)# Save the new model#saveRDS(rf_model_fishing_log, "rf_model_fishing_log.rds")# Load the saved modelsrf_model_no_transform <-readRDS(here::here("rf_model_no_transform.rds"))rf_model_original <-readRDS(here::here("rf_model_original.rds"))rf_model_log <-readRDS(here::here("rf_model_log.rds"))rf_model_fishing_log <-readRDS(here::here("rf_model_fishing_log.rds"))# Function to evaluate modelsevaluate_model <-function(model, data, log_target =FALSE) { predictions <-predict(model, newdata = data)if (log_target) { predictions <-10^predictions -1 } actual <- data$total_fishing_hours# Basic Error Metrics mae <-mean(abs(actual - predictions), na.rm =TRUE) rmse <-sqrt(mean((actual - predictions)^2, na.rm =TRUE)) mape <-mean(abs((actual - predictions) / actual) *100, na.rm =TRUE) medae <-median(abs(actual - predictions), na.rm =TRUE)# R-squared (matching randomForest's % Var explained) r_squared <- model$rsq[length(model$rsq)]# Adjusted R-squared n <-length(actual) p <-length(model$forest$independent.variable.names) # Number of predictors adj_r_squared <-1- ((1- r_squared) * (n -1) / (n - p -1))# Residual Analysis residuals <- actual - predictions mean_residual <-mean(residuals, na.rm =TRUE) sd_residual <-sd(residuals, na.rm =TRUE)# Feature Importance (for Random Forest) feature_importance 
<-importance(model)return(list("Mean Absolute Error"= mae,"Root Mean Squared Error"= rmse,"Mean Absolute Percentage Error"= mape,"Median Absolute Error"= medae,"R-Squared"= r_squared,"Adjusted R-Squared"= adj_r_squared,"Mean of Residuals"= mean_residual,"Standard Deviation of Residuals"= sd_residual,"Feature Importance"= feature_importance ))}# Evaluate all modelsvalidation_data <- combined_data_with_rasters %>%mutate(data_category =case_when( has_AIS & has_SAR ~"Both AIS and SAR", has_AIS &!has_SAR ~"Only AIS",!has_AIS & has_SAR ~"Only SAR",TRUE~"No fishing detected" ) )validation_data <- validation_data %>%filter(data_category =="Both AIS and SAR")# Evaluate all modelsresults_no_transform <-evaluate_model(rf_model_no_transform, validation_data)validation_data_logpres <- validation_data %>%mutate(log_total_presence_score =log10(total_presence_score +1))results_original <-evaluate_model(rf_model_original, validation_data_logpres)# Add evaluation for the new model (fishing hours log-transformed)results_fishing_log <-evaluate_model(rf_model_fishing_log, validation_data, log_target =TRUE)results_log <-evaluate_model(rf_model_log, validation_data_logpres, log_target =TRUE)# Create a data frame with the resultsresults_df <-data.frame(Metric =c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error", "Median Absolute Error", "R-Squared", "Adjusted R-Squared", "Mean of Residuals", "Standard Deviation of Residuals"),No_Transform =c(results_no_transform$`Mean Absolute Error`, results_no_transform$`Root Mean Squared Error`, results_no_transform$`Mean Absolute Percentage Error`, results_no_transform$`Median Absolute Error`, results_no_transform$`R-Squared`, results_no_transform$`Adjusted R-Squared`, results_no_transform$`Mean of Residuals`, results_no_transform$`Standard Deviation of Residuals`),Fishing_Log =c(results_fishing_log$`Mean Absolute Error`, results_fishing_log$`Root Mean Squared Error`, results_fishing_log$`Mean Absolute Percentage 
Error`, results_fishing_log$`Median Absolute Error`, results_fishing_log$`R-Squared`, results_fishing_log$`Adjusted R-Squared`, results_fishing_log$`Mean of Residuals`, results_fishing_log$`Standard Deviation of Residuals`),Presence_Log =c(results_original$`Mean Absolute Error`, results_original$`Root Mean Squared Error`, results_original$`Mean Absolute Percentage Error`, results_original$`Median Absolute Error`, results_original$`R-Squared`, results_original$`Adjusted R-Squared`, results_original$`Mean of Residuals`, results_original$`Standard Deviation of Residuals`),Both_Log =c(results_log$`Mean Absolute Error`, results_log$`Root Mean Squared Error`, results_log$`Mean Absolute Percentage Error`, results_log$`Median Absolute Error`, results_log$`R-Squared`, results_log$`Adjusted R-Squared`, results_log$`Mean of Residuals`, results_log$`Standard Deviation of Residuals`))# Create the kablekable(results_df, format ="html", digits =3,col.names =c("Metric", "No Transform", "Fishing Hours Log", "Presence Score Log", "Both Log"),caption ="Model Performance Comparison") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE) %>%add_header_above(c(" "=1, "Models"=4)) %>%column_spec(1, bold =TRUE)```#### Interpretation of model comparison metrics Based on the provided performance metrics, I would choose the Fishing Hours Log-Transformed Model. Here's the reasoning:1. R-Squared and Adjusted R-Squared: The Fishing Hours Log model has the highest R-squared (0.8239) and Adjusted R-squared values, indicating it explains the most variance in the data.2. Mean Absolute Percentage Error (MAPE): This model has a significantly lower MAPE (69.71%) compared to the No Transform and Presence Log models (both over 1200%). This suggests much better relative accuracy.Median Absolute Error: It has the lowest median absolute error (10.19), which indicates good performance on typical cases.3. 
Root Mean Squared Error (RMSE): While higher than the No Transform model, the difference isn't as dramatic as the improvement in MAPE.4. Mean Absolute Error (MAE): Although higher than No Transform and Presence Log models, this should be considered in context with other metrics.The Both Log model performs very similarly to the Fishing Hours Log model, but the latter edges it out slightly in most metrics.The No Transform and Presence Log models, despite having lower MAE and RMSE, have extremely high MAPE values, suggesting they might be making large relative errors, especially on smaller values.The logarithmic transformation of fishing hours seems to have addressed some issues with the data distribution, leading to more balanced performance across different scales of the target variable.In conclusion, the Fishing Hours Log-Transformed Model appears to offer the best overall performance, particularly in terms of explained variance and relative error metrics. However, the choice might depend on the specific requirements of your application, such as whether absolute or relative errors are more important in your context.### Selected Model performance```{r}evaluate_model <-function(model, data, log_target =FALSE) { predictions <-predict(model, newdata = data)if (log_target) { predictions <-10^predictions -1 } actual <- data$total_fishing_hours# Basic Error Metrics mae <-mean(abs(actual - predictions), na.rm =TRUE) rmse <-sqrt(mean((actual - predictions)^2, na.rm =TRUE)) mape <-mean(abs((actual - predictions) / actual) *100, na.rm =TRUE) medae <-median(abs(actual - predictions), na.rm =TRUE)# R-squared (matching randomForest's % Var explained) r_squared <- model$rsq[length(model$rsq)]# Adjusted R-squared n <-length(actual) p <-length(model$forest$independent.variable.names) # Number of predictors adj_r_squared <-1- ((1- r_squared) * (n -1) / (n - p -1))# Residual Analysis residuals <- actual - predictions mean_residual <-mean(residuals, na.rm =TRUE) sd_residual 
<-sd(residuals, na.rm =TRUE)# Feature Importance (for Random Forest) feature_importance <-importance(model)return(list("Mean Absolute Error"= mae,"Root Mean Squared Error"= rmse,"Mean Absolute Percentage Error"= mape,"Median Absolute Error"= medae,"R-Squared"= r_squared,"Adjusted R-Squared"= adj_r_squared,"Mean of Residuals"= mean_residual,"Standard Deviation of Residuals"= sd_residual,"Feature Importance"= feature_importance ))}validation_data <- combined_data_with_rasters %>%mutate(data_category =case_when( has_AIS & has_SAR ~"Both AIS and SAR", has_AIS &!has_SAR ~"Only AIS",!has_AIS & has_SAR ~"Only SAR",TRUE~"No fishing detected" ) )# Evaluate all modelsvalidation_data <- validation_data %>%filter(data_category =="Both AIS and SAR")results_rf_env <-evaluate_model(rf_model_env, validation_data)# Create a dataframe for the tableresults_table <-data.frame(Metric =c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error","Median Absolute Error", "R-Squared", "Adjusted R-Squared","Mean of Residuals", "Standard Deviation of Residuals"),Value =round(c(results_rf_env$`Mean Absolute Error`, results_rf_env$`Root Mean Squared Error`, results_rf_env$`Mean Absolute Percentage Error`, results_rf_env$`Median Absolute Error`, results_rf_env$`R-Squared`, results_rf_env$`Adjusted R-Squared`, results_rf_env$`Mean of Residuals`, results_rf_env$`Standard Deviation of Residuals`),2) # Round to 2 decimal places)# Create and display the tablekable(results_table, format ="html", digits =4, caption ="Model Evaluation Metrics") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center")# For feature importance, create a separate tablefeature_importance <-as.data.frame(results_rf_env$`Feature Importance`)feature_importance$Feature <-rownames(feature_importance)feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]colnames(feature_importance) <-c("Feature", "%IncMSE", 
"IncNodePurity")# Sort the feature importance table by %IncMSE in descending orderfeature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]kable(feature_importance, format ="html", digits =4, col.names =c("Feature", "%IncMSE", "IncNodePurity"),caption ="Feature Importance") %>%kable_styling(bootstrap_options =c("striped", "hover", "condensed", "responsive"),full_width =FALSE, position ="center") %>%column_spec(1, bold =TRUE) %>%column_spec(2:3, width ="150px")```### Maps of predictions ```{r}# Prepare the prediction dataprediction_data <- combined_data_with_rasters %>% dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)# Make predictionspredictions <-predict(rf_model_env, newdata = prediction_data)# Add predictions to the original datasetcombined_data_01deg <- combined_data_01deg %>%mutate(predicted_fishing_hours =case_when( has_AIS ~ total_fishing_hours, has_SAR ~ predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],TRUE~0 ) )# Create the world mapworld_map <-map_data("world")# Map of predicted fishing hours only # Prepare the data for the mapmap_data <- combined_data_01deg %>%filter(!has_AIS & has_SAR) %>% dplyr::select(lon_std, lat_std, predicted_fishing_hours)#predicted_SAR_only_1RF=map_data#save(predicted_SAR_only_1RF, file="predicted_SAR_only_1RF.Rdata")# Function to create map for a specific regioncreate_region_map <-function(data, world_map, lon_col, lat_col, lon_range, lat_range, title, subtitle) {ggplot() +geom_map(data = world_map, map = world_map,aes(long, lat, map_id = region),color ="darkgray", fill ="lightgray", size =0.1) +geom_tile(data = data, aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = predicted_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="Predicted fishing hours (2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),labels = scales::comma,guide =guide_colorbar(barwidth =20, 
        barheight = 0.5, title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title,
         subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(legend.position = "bottom",
          legend.direction = "horizontal",
          legend.box = "vertical",
          legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
          legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Regional maps of predicted fishing hours (single-RF predictions, 0.1 degree).
# All maps reuse create_region_map(); each call differs only in the lon/lat
# window and the title/subtitle strings.

# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                             c(-180, 180), c(-90, 90),
                                             "Predicted Fishing Hours in Areas with Only SAR Detections",
                                             "0.1 degree resolution")
# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                   c(-100, -50), c(0, 40),
                                   "Predicted Fishing Hours in the Caribbean",
                                   "0.1 degree resolution")
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                         c(-20, 80), c(0, 70),
                                         "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.1 degree resolution")
# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                              c(60, 180), c(-20, 60),
                              "Predicted Fishing Hours in Asia",
                              "0.1 degree resolution")
# Print the maps
print(predicted_SAR_only_plot)
print(caribbean_map)
print(indian_european_map)
print(asia_map)

# Map of both original and predicted AIS fishing hours
# Visualize the results. Fishing hours are shown on a log1p colour scale so
# that cells with zero hours remain plottable.
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(option = "inferno",
                     direction = -1,
                     trans = "log1p",
                     name = "AIS Fishing Effort (hours; 2017-2020)",
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
                     labels = scales::comma,
                     guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                                            title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "vertical",
        legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
        legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

# Aggregate data to 0.5 degree resolution
# First, round the coordinates to the nearest 0.5 degree
# (round(x * 2) / 2 snaps each 0.1-degree cell centre to the closest half degree).
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  )

# Aggregate the data: total predicted hours per 0.5-degree cell.
aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

#save(aggregated_data, file=here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Create the map (same layout as the 0.1-degree plot above).
predicted_plot_05deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data,
            aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(option = "inferno",
                     direction = -1,
                     trans = "log1p",
                     name = "AIS Fishing Effort (hours; 2017-2020)",
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
                     labels = scales::comma,
                     guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                                            title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.5 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "vertical",
        legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
        legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_05deg)

# Regional 0.5-degree maps. NOTE(review): these reuse (overwrite) the
# caribbean_map / indian_european_map / asia_map names from the 0.1-degree
# section above — intentional in a linear notebook, but order-sensitive.
# Caribbean map
caribbean_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                   c(-100, -50), c(0, 40),
                                   "Fishing Hours in the Caribbean", "0.5 degree resolution")
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                         c(-20, 80), c(0, 70),
                                         "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.5 degree resolution")
# Asia map
asia_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                              c(60, 180), c(-20, 60),
                              "Fishing Hours in Asia", "0.5 degree resolution")
# Print the maps
print(caribbean_map)
print(indian_european_map)
print(asia_map)
```

### Test for spatial auto-correlation

```{r}
##------------ Create a non-random test set
# Prepare the data: keep only cells with both AIS and SAR coverage, and the
# model's response + predictor columns; drop incomplete rows.
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()

# Split the data into training and test sets (70/30 random split).
set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(data), 0.7*nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model
# (kept commented: the fitted model is loaded from disk below instead of
# being re-trained on every render).
#set.seed(123) # for reproducibility
#rf_timing <- system.time({
# rf_model_env <- randomForest(
# total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
# data = train_data,
# ntree = 500,
# importance = TRUE
# )
#})
#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
# Save the new model
#rf_model_env_train=rf_model_env
#save(rf_model_env_train, file = "rf_model_01deg_with_rasters_train.Rdata")

# Loads rf_model_env_train fitted on the same 70% split (seed 123) as above.
load(here::here("rf_model_01deg_with_rasters_train.Rdata"))

# Make predictions on the test set
test_data$predictions <- predict(rf_model_env_train, test_data)

# Calculate residuals (observed - predicted)
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Create a neighbors list (k-nearest-neighbour graph on test-set cells).
k <- 8 # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)

# Create a spatial weights matrix (row-standardised)
lw <- nb2listw(nb, style = "W")

# Perform Moran's I test on the test set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Create a data frame with the test results
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance",
                "Standard deviate", "P-value"),
  Value = c(
    round(moran_test$estimate[1], 2),
    round(moran_test$estimate[2], 2),
    round(moran_test$estimate[3], 2),
    round(moran_test$statistic, 2),
    # 2.2e-16 is R's default print floor for tiny p-values.
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
           format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)

# Create and display the table
kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
```

### Spatial cross-validation

```{r}
load(here::here("training_data.Rdata"))
# Assuming your training_data has lon_std (longitude) and lat_std (latitude) columns
coords <- SpatialPoints(training_data[, c("lon_std", "lat_std")])
# Combine your training data with spatial coordinates
spatial_data <- SpatialPointsDataFrame(coords, data = training_data)

#Testing to split the coordinates with k-means clustering
# Ensure spatial_data is an sf object
if (!inherits(spatial_data, "sf")) {
  spatial_data_sf <- st_as_sf(spatial_data)
} else {
  spatial_data_sf <- spatial_data
}

# Extract coordinates
coords <- st_coordinates(spatial_data_sf)

# Perform k-means clustering: 6 spatial clusters become the 6 CV folds.
set.seed(123) # for reproducibility
kmeans_result <- kmeans(coords, centers = 6)

# Add cluster assignments to the sf object
spatial_data_sf$block <- as.factor(kmeans_result$cluster)

# Create the plot showing the spatial block layout.
ggplot() +
  geom_sf(data = spatial_data_sf, aes(color = block), size = 1) +
  scale_color_viridis_d(name = "Block") +
  theme_minimal() +
  labs(title = "Spatial Blocks for 6-Fold Cross-Validation",
       subtitle = "Points colored by block assignment") +
  theme(legend.position = "bottom")

# Spatially blocked CV: one RF per held-out block, run in parallel.
# (kept commented: results are loaded from morans_i_results.Rdata below).
# Register cores for parallel processing (use 1 less than your total cores)
#cl <- makeCluster(detectCores() - 1)
#registerDoParallel(cl)
# Initialize a list to store results
#results <- foreach(i = 1:6, .packages = c("randomForest", "dplyr"), .export = "spatial_data_sf") %dopar% {
# Get the indices of the training and test sets for the i-th block
# test_indices <- spatial_data_sf$block == i
# train_data <- spatial_data_sf[!test_indices, ]
# test_data <- spatial_data_sf[test_indices, ]
# Train Random Forest on the training data
# rf_model <- randomForest(
# total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
# data = train_data,
# ntree = 500,
# importance = TRUE
# )
# Save the model
# model_filename <- paste0("rf_model_fold_", i, ".rds")
# saveRDS(rf_model, file = model_filename)
# Predict on the test data
# predictions <- predict(rf_model, test_data)
# Return a list with results and model filename
# list(
# results = data.frame(observed = test_data$total_fishing_hours, predicted = predictions, fold = i),
# model_filename = model_filename
# )
#}
# Stop the cluster once done
#stopCluster(cl)
# Extract results and model filenames
#results_df <- do.call(rbind, lapply(results, function(x) x$results))
#model_filenames <- sapply(results, function(x) x$model_filename)
# Save results
#save(results_df, file = "spatial_cv_results.RData")
#save(model_filenames, file = "model_filenames.RData")
#load(here::here("spatial_cv_results.RData"))
#load(here::here("model_filenames.RData"))
# Ensure spatial_data and results_df are aligned
#spatial_data <- spatial_data[order(row.names(spatial_data)), ]
#results_df <- results_df[order(row.names(results_df)), ]
# Get all coordinates
#all_coordinates <- coordinates(spatial_data)
# Create neighbor list for the entire dataset
#k <- 30 # Number of nearest neighbors, adjust as needed
#nb_all <- knn2nb(knearneigh(all_coordinates, k = k))
# Create spatial weights for the entire dataset
#lw_all <- nb2listw(nb_all, style="W", zero.policy = TRUE)
# Function to calculate Moran's I
#calculate_morans_i <- function(residuals, indices, lw_all) {
# Create a logical vector for subsetting
# subset_vector <- rep(FALSE, length(lw_all$neighbours))
# subset_vector[indices] <- TRUE
# Subset the weights list for the current fold
# lw_subset <- subset.listw(lw_all, subset_vector, zero.policy = TRUE)
# Calculate Moran's I
# moran_result <- moran.test(residuals, lw_subset, zero.policy = TRUE)
# return(moran_result)
#}
# Calculate Moran's I for each fold in the spatial cross-validation
#morans_i_results <- list()
#for (i in 1:6) {
# Get the indices for this fold
# fold_indices <- which(results_df$fold == i)
# Calculate residuals
# fold_residuals <- results_df$observed[fold_indices] - results_df$predicted[fold_indices]
# Calculate Moran's I for this fold
# morans_i_results[[i]] <- calculate_morans_i(fold_residuals, fold_indices, lw_all)
# Print progress
# print(paste("Completed fold", i))
#}
#save(morans_i_results, file="morans_i_results.Rdata")

# Load the precomputed per-fold Moran's I results (list of 6 htest objects).
load(here::here("morans_i_results.Rdata"))

# Print results
# NOTE(review): this commented loop iterates 1:5 while there are 6 folds —
# confirm intent if it is ever reactivated.
#print("Moran's I for each fold in spatial cross-validation:")
#for (i in 1:5) {
# print(paste("Fold", i))
# print(morans_i_results[[i]])
#}

# Extract relevant information from Moran's I results
morans_table <- data.frame(
  Fold = 1:6,
  Moran_I = sapply(morans_i_results, function(x) x$estimate[1]),
  Expectation = sapply(morans_i_results, function(x) x$estimate[2]),
  Variance = sapply(morans_i_results, function(x) x$estimate[3]),
  Std_Deviate = sapply(morans_i_results, function(x) x$statistic),
  P_Value = sapply(morans_i_results, function(x) x$p.value)
)

# Format the table for display.
morans_table_formatted <- morans_table %>%
  mutate(
    Moran_I = round(Moran_I, 4),
    Expectation = format(Expectation, scientific = TRUE, digits = 4),
    Variance = format(Variance, scientific = TRUE, digits = 4),
    Std_Deviate = round(Std_Deviate, 2),
    P_Value = ifelse(P_Value < 2.2e-16, "< 2.2e-16",
                     format(P_Value, scientific = TRUE, digits = 4))
  )

# Create and print the table using kable and kable_styling
kable(morans_table_formatted,
      col.names = c("Fold", "Moran's I", "Expectation", "Variance", "Std. Deviate", "p-value"),
      caption = "Moran's I Results for Each Fold in Spatial Cross-Validation",
      align = c('c', 'r', 'r', 'r', 'r', 'r'),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "150px")
```

### Spatially blocked predictions

```{r}
# Load necessary libraries
library(raster)
library(ggplot2)
library(viridis)
library(dplyr)
library(data.table)

# Load the model filenames
#load(here::here("model_filenames.RData"))
# Function to make predictions using all 6 models
#make_predictions <- function(data) {
# predictions <- matrix(0, nrow = nrow(data), ncol = 6)
# for (i in 1:6) {
# model <- readRDS(here::here(model_filenames[i]))
# predictions[, i] <- predict(model, data)
# }
# Take the average prediction across all models
# rowMeans(predictions)
#}
# Prepare the prediction data (same as before)
#prediction_data <- combined_data_with_rasters %>%
# dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)
# Make predictions using the spatially blocked models
#new_predictions <- make_predictions(prediction_data)
#save(new_predictions, file=here::here("new_predictions.Rdata"))

# Load precomputed ensemble predictions (mean over the 6 fold models).
load(here::here("new_predictions.Rdata"))

# Add predictions to the original dataset:
# - cells with AIS keep their observed hours,
# - SAR-only cells take the blocked-RF prediction (matched by coordinates),
# - everything else gets 0.
# NOTE(review): prediction_data is only defined in the commented-out block
# above — confirm it exists in the session before running this chunk.
# NOTE(review): paste(lon, lat) without a distinctive separator is used as a
# match key; verify it cannot collide for these coordinates.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ new_predictions[match(paste(lon_std, lat_std),
                                      paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Now you can use the same mapping code as before, but it will use the new predictions
# Map of predicted fishing hours only
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)
#predicted_SAR_only_6RF=map_data
#save(predicted_SAR_only_6RF, file="predicted_SAR_only_6RF.Rdata")

# Use your create_region_map function to create the maps
# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                             c(-180, 180), c(-90, 90),
                                             "Predicted Fishing Hours in Areas with Only SAR Detections",
                                             "0.1 degree resolution (Spatially Blocked Model)")
# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                   c(-100, -50), c(0, 40),
                                   "Predicted Fishing Hours in the Caribbean",
                                   "0.1 degree resolution (Spatially Blocked Model)")
# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                         c(-20, 80), c(0, 70),
                                         "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.1 degree resolution (Spatially Blocked Model)")
# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                              c(60, 180), c(-20, 60),
                              "Predicted Fishing Hours in Asia",
                              "0.1 degree resolution (Spatially Blocked Model)")
# Print the maps
print(predicted_SAR_only_plot)
print(caribbean_map)
print(indian_european_map)
print(asia_map)

# Create the global map of both original and predicted fishing hours
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(option = "inferno",
                     direction = -1,
                     trans = "log1p",
                     name = "AIS Fishing Effort (hours; 2017-2020)",
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
                     labels = scales::comma,
                     guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                                            title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Spatially Blocked Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "vertical",
        legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
        legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

# Aggregate data to 0.5 degree resolution
# NOTE(review): this overwrites the combined_data_05deg created earlier in the
# document (single-RF version) — order of execution matters.
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  ) %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarize(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE)
  ) %>%
  ungroup()

# Global map (0.5 degree)
global_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                      c(-180, 180), c(-90, 90),
                                      "Global Fishing Hours",
                                      "0.5 degree resolution (Observed and Predicted)")
# Caribbean map (0.5 degree)
caribbean_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                         c(-100, -50), c(0, 40),
                                         "Fishing Hours in the Caribbean",
                                         "0.5 degree resolution (Observed and Predicted)")
# Northwestern Indian Ocean to Western European waters map (0.5 degree)
indian_european_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                               c(-20, 80), c(0, 70),
                                               "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                               "0.5 degree resolution (Observed and Predicted)")
# Asia map (0.5 degree)
asia_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                    c(60, 180), c(-20, 60),
                                    "Fishing Hours in Asia",
                                    "0.5 degree resolution (Observed and Predicted)")
# Print the maps
print(global_map_05deg)
print(caribbean_map_05deg)
print(indian_european_map_05deg)
print(asia_map_05deg)
```

### Differences among predictions

```{r}
# Load the data
# NOTE(review): these two load() calls omit here::here(), unlike the rest of
# the document — confirm the working directory is the project root here.
load("predicted_SAR_only_6RF.Rdata")
load("predicted_SAR_only_1RF.Rdata")

# Prepare the data (assuming you've already loaded and arranged it):
# stack both prediction sets long-form with a method label for plotting.
combined_data <- bind_rows(
  predicted_SAR_only_6RF %>%
    dplyr::select(predicted_fishing_hours) %>%
    mutate(method = "6RF"),
  predicted_SAR_only_1RF %>%
    dplyr::select(predicted_fishing_hours) %>%
    mutate(method = "1RF")
)

# 1. Log-transformed density plot (+1 shift keeps zero-hour cells on the log axis).
log_density_plot <- ggplot(combined_data, aes(x = predicted_fishing_hours + 1, fill = method)) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = scales::comma) +
  scale_fill_manual(values = c("6RF" = "blue", "1RF" = "red")) +
  labs(title = "Distribution of Predicted Fishing Hours (Log Scale)",
       x = "Predicted Fishing Hours (Log Scale)",
       y = "Density",
       fill = "Method") +
  theme_minimal() +
  theme(legend.position = "bottom")

print(log_density_plot)

# Summary statistics per prediction method.
summary_stats <- combined_data %>%
  group_by(method) %>%
  summarise(
    mean = mean(predicted_fishing_hours, na.rm = TRUE),
    median = median(predicted_fishing_hours, na.rm = TRUE),
    sd = sd(predicted_fishing_hours, na.rm = TRUE),
    min = min(predicted_fishing_hours, na.rm = TRUE),
    max = max(predicted_fishing_hours, na.rm = TRUE),
    q1 = quantile(predicted_fishing_hours, 0.25, na.rm = TRUE),
    q3 = quantile(predicted_fishing_hours, 0.75, na.rm = TRUE)
  )

kable(summary_stats, format = "html", digits = 2,
      col.names = c("Method", "Mean", "Median", "SD", "Min", "Max", "Q1", "Q3"),
      caption = "Summary Statistics of Predicted Fishing Hours by Method") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Statistics" = 7)) %>%
  column_spec(1, bold = TRUE)
```

```{r}
# Ensure both datasets have the same rows in the same order
# (positional subtraction below depends on identical row ordering).
predicted_SAR_only_6RF <- predicted_SAR_only_6RF %>% arrange(lon_std, lat_std)
predicted_SAR_only_1RF <- predicted_SAR_only_1RF %>% arrange(lon_std, lat_std)

# Calculate differences:
# pred_diff — absolute difference (6RF - 1RF);
# rel_diff  — symmetric relative difference, i.e. difference over the pairwise mean.
difference_data <- predicted_SAR_only_6RF %>%
  mutate(
    pred_diff = predicted_fishing_hours - predicted_SAR_only_1RF$predicted_fishing_hours,
    rel_diff = (predicted_fishing_hours - predicted_SAR_only_1RF$predicted_fishing_hours) /
      ((predicted_fishing_hours + predicted_SAR_only_1RF$predicted_fishing_hours) / 2)
  ) %>%
  na.omit()

#summary(difference_data)
# Check for NAs
#print(sum(is.na(difference_data$pred_diff)))
#print(sum(is.na(difference_data$rel_diff)))

# Create the world map
world_map <- map_data("world")

# Function to create difference maps for a specific region.
#
# data         data frame holding coordinate columns and the difference column.
# world_map    map_data("world") basemap.
# lon_col,     names (strings) of the longitude/latitude columns in `data`,
# lat_col      accessed via .data[[...]] so callers can pass any resolution.
# lon_range,   c(min, max) window for the map extent.
# lat_range
# title,       plot annotations.
# subtitle
# var          name (string) of the column to map to fill.
# legend_title colourbar title.
#
# Uses a diverging red-white-blue scale with symmetric limits around 0 so
# that equal positive and negative differences get equal colour intensity.
create_difference_map <- function(data, world_map, lon_col, lat_col,
                                  lon_range, lat_range, title, subtitle,
                                  var, legend_title) {
  max_abs_value <- max(abs(data[[var]]), na.rm = TRUE)
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data,
              aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = .data[[var]])) +
    scale_fill_gradient2(low = "red", mid = "white", high = "blue",
                         midpoint = 0,
                         limits = c(-max_abs_value, max_abs_value),
                         name = legend_title,
                         guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                                                title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title,
         subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(legend.position = "bottom",
          legend.direction = "horizontal",
          legend.box = "vertical",
          legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
          legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Create global difference maps
abs_diff_global <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Absolute Difference in Predicted Fishing Hours",
  "6RF Model - 1RF Model (0.1 degree resolution)",
  "pred_diff",
  "Difference in Fishing Hours")

rel_diff_global <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Relative Difference in Predicted Fishing Hours",
  "(6RF Model - 1RF Model) / Average (0.1 degree resolution)",
  "rel_diff",
  "Relative Difference")

# Create regional difference maps
# Caribbean
abs_diff_caribbean <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Absolute Difference in Predicted Fishing Hours \nin the Caribbean",
  "6RF Model - 1RF Model (0.1 degree resolution)",
  "pred_diff",
  "Difference in Fishing Hours")

rel_diff_caribbean <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Relative Difference in Predicted Fishing Hours \nin the Caribbean",
  "(6RF Model - 1RF Model) / Average (0.1 degree resolution)",
  "rel_diff",
  "Relative Difference")

# Northwestern Indian Ocean to Western European waters
abs_diff_indian_european <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Absolute Difference in Predicted Fishing Hours from \nNorthern Indian Ocean to Eastern Atlantic",
  "6RF Model - 1RF Model (0.1 degree resolution)",
  "pred_diff",
  "Difference in Fishing Hours")

rel_diff_indian_european <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Relative Difference in Predicted Fishing Hours from \nNorthern Indian Ocean to Eastern Atlantic",
  "(6RF Model - 1RF Model) / Average (0.1 degree resolution)",
  "rel_diff",
  "Relative Difference")

# Asia
abs_diff_asia <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Absolute Difference in Predicted Fishing Hours in Asia",
  "6RF Model - 1RF Model (0.1 degree resolution)",
  "pred_diff",
  "Difference in Fishing Hours")

rel_diff_asia <- create_difference_map(
  difference_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Relative Difference in Predicted Fishing Hours in Asia",
  "(6RF Model - 1RF Model) / Average (0.1 degree resolution)",
  "rel_diff",
  "Relative Difference")

# Display the maps (absolute-difference versions kept but not printed).
#print(abs_diff_global)
print(rel_diff_global)
#print(abs_diff_caribbean)
print(rel_diff_caribbean)
#print(abs_diff_indian_european)
print(rel_diff_indian_european)
#print(abs_diff_asia)
print(rel_diff_asia)
```